import os
import time
from datetime import datetime
from selenium import webdriver
import csv

from selenium.common import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


# Spider that scrapes the Juejin front-page article feed with Selenium
# and appends the results to csv_collect/big_articles.csv.
class BigArticlesSpider(object):
    # Output CSV path; rows are appended across runs.
    CSV_PATH = "csv_collect/big_articles.csv"

    # Column order shared by the CSV header and every data row.
    CSV_FIELDS = [
        "url",
        "title",
        "brief",
        "author",
        "likes",
        "watchs",
        "cover",
        "category",
    ]

    def __init__(self):
        # The Chrome WebDriver is created lazily in run().
        self.driver = None

    # Main entry point.
    def run(self, url="https://juejin.cn/"):
        """Open *url* in Chrome, scrape the article feed and save it to CSV.

        The browser session is always shut down via ``quit()`` — even when
        scraping raises — so no chromedriver processes are leaked.
        """
        # Most tuning happens through these (optional) Chrome flags.
        chrome_options = Options()
        chrome_options.add_argument("--window-size=1920,1080")
        # Toggles kept for convenience — enable as needed:
        # chrome_options.add_argument("--disable-extensions")
        # chrome_options.add_argument("--headless")
        # chrome_options.add_argument("--disable-gpu")
        # chrome_options.add_argument("--disable-software-rasterizer")
        # chrome_options.add_argument('--no-sandbox')
        # chrome_options.add_argument('--ignore-certificate-errors')
        # chrome_options.add_argument('--allow-running-insecure-content')
        # chrome_options.add_argument("blink-settings=imagesEnabled=false")

        self.driver = webdriver.Chrome(options=chrome_options)
        try:
            self.driver.get(url)
            self.driver.implicitly_wait(2)  # seconds

            # Scrape the target section(s) here.
            self.sp_news()
        finally:
            # quit() ends the whole session and the chromedriver process;
            # close() would only close the current window and leak the rest.
            self.driver.quit()

    @staticmethod
    def _extract(item, selector, attr=None):
        """Return the text (or attribute *attr*) of the first descendant of
        *item* matching the CSS *selector*, or "NOTFOUND" when absent."""
        try:
            element = item.find_element(By.CSS_SELECTOR, selector)
        except NoSuchElementException:
            return "NOTFOUND"
        return element.get_attribute(attr) if attr else element.text

    # Parse the page and clean the article data.
    def sp_news(self, scroll_rounds=1200, scroll_pause=1.0):
        """Scroll the infinite feed, parse every article card and append the
        rows to ``CSV_PATH``.

        :param scroll_rounds: how many times to scroll to the page bottom
            (each scroll triggers the feed to load more items).
        :param scroll_pause: seconds to wait after each scroll so the newly
            requested items have time to render.
        """
        driver = self.driver

        # Simulate scrolling first so the lazy-loaded feed fills up.
        js = "window.scrollTo(0,document.body.scrollHeight)"
        for _ in range(scroll_rounds):
            driver.execute_script(js)
            time.sleep(scroll_pause)

        all_articles = driver.find_elements(By.CSS_SELECTOR, '#juejin > div.view-container.container.index-container '
                                                             '> main > div > div.timeline-container > div > div > div'
                                                             ' > div > div > li')
        total = len(all_articles)
        print("本轮爬取数据总长度：" + str(total))

        # Parse every card; missing fields become "NOTFOUND".
        big_articles = []
        for count, item in enumerate(all_articles, start=1):
            print("当前解析位置：第 " + str(count) + "/" + str(total) + " 条")

            big_articles.append([
                self._extract(item, 'div > div > div > div.title-row > a', attr="href"),
                self._extract(item, 'div > div > div > div.title-row > a'),
                self._extract(item, 'div > div > div > div.abstract > a > div'),
                self._extract(item,
                              'div > div > div > div.entry-footer > ul > li.item.meta-container > a > div'),
                self._extract(item, 'div > div > div > div.entry-footer > ul > li.item.like > span'),
                self._extract(item, 'div > div > div > div.entry-footer > ul > li.item.view > span'),
                self._extract(item, 'div > div > img', attr="src"),
                self._extract(item, 'div > div > div > div.entry-footer > div'),
            ])

        # Save the data. Create the output directory on first use and write
        # the header row only when the file does not exist yet.
        os.makedirs(os.path.dirname(self.CSV_PATH), exist_ok=True)
        write_header = not os.path.exists(self.CSV_PATH)
        with open(self.CSV_PATH, "a+", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(self.CSV_FIELDS)
            writer.writerows(big_articles)

        # Console success message.
        print("---------BigArticlesSpider----爬取综合文章成功----------")


# Script entry point.
if __name__ == "__main__":
    # Top-level error boundary: report failures instead of crashing silently.
    try:
        spider = BigArticlesSpider()
        spider.run()
    except Exception as e:
        # Print the full traceback — not just the message — so failures
        # deep inside Selenium calls can actually be diagnosed.
        import traceback
        traceback.print_exc()
        print("错误:", e)
